Accreditation exercise for 16S rRNA data analysis

setwd("/Users/macbook/Desktop/16S_rna_analysis")

Data Sources and description

require(kableExtra)
require(phyloseq)
phy <- readRDS("data/physeq_silva_90")
meta <- data.frame(sample_data(phy))
meta %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position", 
    "repeat_header", position = "")) %>% scroll_box(width = "100%", height = "300px")
Inflammation BV Age BMI sample
S1 Low Negative 19 22.48100 S1
S100 High Positive 17 27.82931 S100
S101 High Positive 19 NA S101
S102 High Negative 18 28.95900 S102
S103 High Negative 20 26.95312 S103
S104 Low Negative 18 27.05380 S104
S105 Low Negative 18 20.69049 S105
S106 High Positive 17 21.21832 S106
S107 High Negative 22 22.49964 S107
S108 High Positive 20 NA S108
S109 High Positive 19 34.85018 S109
S11 Low Negative 20 18.87100 S11
S111 Low Negative 22 23.35564 S111
S112 High Positive 21 24.65483 S112
S114 High Positive 20 29.27796 S114
S115 High Positive 19 28.54828 S115
S116 High Positive 20 21.77844 S116
S117 High Positive 20 29.51594 S117
S118 High Positive 19 22.58271 S118
S119 High Negative 17 17.79993 S119
S120 High Negative 22 35.10667 S120
S121 High Positive 17 31.55556 S121
S122 Low Negative 17 25.39062 S122
S123 High Positive 16 21.05171 S123
S124 High Positive 17 25.21736 S124
S126 High Positive 17 27.54821 S126
S127 Low Positive 20 30.47052 S127
S128 High Negative 17 19.14672 S128
S129 Low Negative 16 22.32143 S129
S130 Low Negative 16 26.40235 S130
S131 Low Negative 19 25.15315 S131
S132 NA Positive 16 21.09619 S132
S134 Low Negative 20 18.13122 S134
S135 High Positive 19 30.48780 S135
S137 Low Negative 18 28.36035 S137
S138 High Positive 16 18.61150 S138
S139 High Positive 18 27.25089 S139
S14 Low Negative NA 29.01700 S14
S140 Low Negative 18 21.87500 S140
S141 High Positive 20 27.55556 S141
S142 Low Negative 20 22.49964 S142
S143 High Positive 18 NA S143
S145 Low Negative 18 28.44444 S145
S15 Low Negative NA 16.18400 S15
S16 Low Negative 16 21.35800 S16
S17 Low Negative 17 27.66000 S17
S18 Low Negative 16 30.35900 S18
S2 Low Positive NA NA S2
S21 High Negative 17 22.50000 S21
S24 Low Negative 16 31.08000 S24
S25 Low Positive NA 20.96400 S25
S26 Low Negative 16 19.23400 S26
S28 Low Negative 17 24.84100 S28
S29 Low Negative 17 20.83000 S29
S3 High Negative 18 22.15100 S3
S31 High Positive 18 19.48700 S31
S33 High Negative 21 34.96400 S33
S34 High Positive 21 18.80800 S34
S35 High Positive NA 22.03200 S35
S37 High Positive NA 22.86200 S37
S38 Low Negative 17 23.55600 S38
S39 Low Negative 17 26.02600 S39
S4 High Positive 19 29.37200 S4
S40 Low Negative 20 25.96500 S40
S42 High Positive 16 23.50800 S42
S43 Low Negative 17 26.39800 S43
S44 Low Negative 16 22.89300 S44
S46 Low Negative 16 20.56900 S46
S47 Low Positive 18 34.92800 S47
S48 Low Negative 18 23.62400 S48
S50 High Negative NA 20.44900 S50
S51 High Negative 19 22.47700 S51
S52 High Positive 21 16.75700 S52
S53 Low Negative 18 22.60000 S53
S54 High Positive 19 40.00900 S54
S55 Low Positive 19 24.46500 S55
S56 Low Negative 18 18.31400 S56
S57 Low Positive 21 21.33800 S57
S58 High Positive 19 21.33800 S58
S59 High Positive 21 21.33800 S59
S60 Low Positive 19 23.04700 S60
S61 Low Positive 19 26.49200 S61
S62 High Positive 19 21.60400 S62
S63 High Positive 18 21.21800 S63
S64 High Positive 22 19.83471 S64
S65 High Negative 19 NA S65
S66 Low Negative 19 36.10694 S66
S67 Low Negative 18 24.80159 S67
S68 High Positive 19 22.98145 S68
S69 High Positive 17 26.03749 S69
S71 High Positive 19 33.05785 S71
S72 Low Positive 20 30.07598 S72
S74 High Negative 18 27.35885 S74
S75 High Negative 18 20.07775 S75
S77 High Negative 18 26.31464 S77
S78 Low Positive 18 21.71925 S78
S79 High Positive 21 30.17882 S79
S8 NA Negative 19 28.76400 S8
S80 High Negative 19 19.72387 S80
S81 Low Negative 20 28.35306 S81
S82 High Positive 18 23.42209 S82
S83 High Positive 20 25.00000 S83
S84 High Positive 18 20.90420 S84
S85 High Positive 19 25.72242 S85
S86 Low Negative 19 19.47341 S86
S88 Low Negative 17 22.83288 S88
S89 High Positive 18 25.71101 S89
S9 Low Negative 22 21.75500 S9
S90 High Negative 18 29.27796 S90
S92 Low Negative 20 20.83000 S92
S93 High Positive 18 36.64982 S93
S94 High Positive 16 23.82812 S94
S95 High Negative 17 25.14861 S95
S96 High Positive 16 28.87544 S96
S97 High Positive 17 17.36044 S97
S98 High Positive 18 21.07720 S98

Denosing stats

require(kableExtra)
stats <- read.table("data/stats.tsv", header = T)
stats %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position", 
    "repeat_header", position = "")) %>% scroll_box(width = "100%", height = "300px")
sample.id input filtered denoised merged non.chimeric
S1 108494 71218 71218 64963 64963
S100 126661 48555 48555 47328 47328
S101 150614 107251 107251 106154 106154
S102 146772 121474 121474 119994 119994
S103 366244 305258 305258 301238 301238
S104 339681 273582 273582 266178 266178
S105 176537 130659 130659 127538 127538
S106 409963 322929 322929 315569 315569
S107 162844 104936 104936 102849 102849
S108 320033 252510 252510 249186 249186
S109 322310 260472 260472 256255 256255
S11 111790 82721 82721 76572 76572
S111 282144 239108 239108 236964 236964
S112 49460 36709 36709 36462 36462
S114 331129 274093 274093 269072 269072
S115 109200 88749 88749 87596 87596
S116 205450 141207 141207 138321 138321
S117 254884 206061 206061 201946 201946
S118 28687 14662 14662 14361 14361
S119 350937 234338 234338 226540 226540
S120 1532253 1281379 1281379 1263436 1263436
S121 345921 252646 252646 248813 248813
S122 353419 295442 295442 292978 292978
S123 225713 179217 179217 176567 176567
S124 140088 106449 106449 102541 102541
S126 380866 285061 285061 280926 280926
S127 536496 422807 422807 417772 417772
S128 210601 175092 175092 174022 174022
S129 90546 68813 68813 68253 68253
S130 387131 319617 319617 313316 313316
S131 217614 170242 170242 167290 167290
S132 133395 89377 89377 84978 84978
S134 141947 100908 100908 93233 93233
S135 303167 251788 251788 247979 247979
S137 83077 53328 53328 49049 49049
S138 110492 80605 80605 77501 77501
S139 361408 302110 302110 298271 298271
S14 215017 183251 183251 167757 167757
S140 33079 22371 22371 21430 21430
S141 161665 112579 112579 108088 108088
S142 45924 34647 34647 32183 32183
S143 301794 255139 255139 251130 251130
S145 106428 78663 78663 73366 73366
S15 207191 168396 168396 153352 153352
S16 51951 40834 40834 37945 37945
S17 226827 184346 184346 168521 168521
S18 70709 44884 44884 41018 41018
S2 258746 193555 193555 172767 172767
S21 14155 11069 11069 10453 10453
S24 104447 75697 75697 70193 70193
S25 154810 119846 119846 111091 111091
S26 134898 103978 103978 95340 95340
S28 54256 43261 43261 40348 40348
S29 92226 73108 73108 68153 68153
S3 29696 21821 21821 20087 20087
S31 98386 81770 81770 78054 78054
S33 30434 24556 24556 23666 23666
S34 231852 173395 173395 156773 156773
S35 139023 111869 111869 102136 102136
S37 186767 136813 136813 122851 122851
S38 155810 101964 101964 92594 92594
S39 114609 89616 89616 79116 79116
S4 176262 144419 144419 135466 135466
S40 102954 72076 72076 66254 66254
S42 161516 130610 130610 121306 121306
S43 26710 22748 22748 21109 21109
S44 89349 64294 64294 58984 58984
S46 96397 69206 69206 64897 64897
S47 188461 156130 156130 147590 147590
S48 31718 19839 19839 18481 18481
S50 114256 88893 88893 81813 81813
S51 103734 77405 77405 71159 71159
S52 17009 12460 12460 11937 11937
S53 160395 129052 129052 118501 118501
S54 193042 155351 155351 145664 145664
S55 183286 142289 142289 129107 129107
S56 133249 93221 93221 87063 87063
S57 229468 143550 143550 126131 126131
S58 107133 84806 84806 80997 80997
S59 203605 168385 168385 157877 157877
S60 138892 100992 100992 94710 94710
S61 172758 138556 138556 130388 130388
S62 228758 177554 177554 159343 159343
S63 131697 92289 92289 86373 86373
S64 151183 114620 114620 113429 113429
S65 228293 178060 178060 176375 176375
S66 106851 68507 68507 64296 64296
S67 247723 205893 205893 203492 203492
S68 70210 57256 57256 56408 56408
S69 197006 150993 150993 148130 148130
S71 167183 124266 124266 122163 122163
S72 160242 120510 120510 115616 115616
S74 156178 107160 107160 105434 105434
S75 207264 164096 164096 162241 162241
S77 117901 84577 84577 83246 83246
S78 231741 172630 172630 170057 170057
S79 140625 100372 100372 99358 99358
S8 118954 65831 65831 60053 60053
S80 96400 27251 27251 26944 26944
S81 46643 22578 22578 22018 22018
S82 312725 247048 247048 244371 244371
S83 217518 177949 177949 168845 168845
S84 270208 232543 232543 230945 230945
S85 342197 246347 246347 238033 238033
S86 80376 49979 49979 48693 48693
S88 42682 26392 26392 24403 24403
S89 176154 139611 139611 137884 137884
S9 79870 68637 68637 63332 63332
S90 66713 35239 35239 31476 31476
S92 74331 48971 48971 48454 48454
S93 268130 212003 212003 206614 206614
S94 93202 43708 43708 43394 43394
S95 31443 12006 12006 11667 11667
S96 227880 183185 183185 181108 181108
S97 317252 268332 268332 266395 266395
S98 396734 313305 313305 309626 309626
mean(stats$input)
[1] 185871.8
mean(stats$filtered)
[1] 143041.2
mean(stats$merged)
[1] 137931.7
mean(stats$non.chimeric)
[1] 137931.7
lost_filt <- (mean(stats$input) - mean(stats$filtered))/mean(stats$input)
lost_filt
[1] 0.2304308
lost_merge <- (mean(stats$filtered) - mean(stats$merged))/mean(stats$filtered)
lost_merge
[1] 0.03572084
lost_chime <- (mean(stats$merged) - mean(stats$non.chimeric))/mean(stats$merged)
lost_chime
[1] 0
lost_overall <- (mean(stats$input) - mean(stats$non.chimeric))/mean(stats$input)
lost_overall
[1] 0.2579204


require(plotly)
df <- stats[, -c(1)]  #sample ids
p <- ggplot(stack(df), aes(x = ind, y = values)) + ylim(0, 5e+05) + geom_boxplot(col = "blue", 
    outline = FALSE) + theme_bw() + ylab("Number of reads") + xlab("")
ggplotly(p)

Detected ASvs

df <- as.data.frame(sample_sums(phy))
colnames(df) <- c("Number_of_ASVs")
df %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position", 
    "repeat_header", position = "")) %>% scroll_box(width = "100%", height = "200px")
Number_of_ASVs
S1 63230
S100 47012
S101 101156
S102 111762
S103 280956
S104 263280
S105 125068
S106 304143
S107 102080
S108 235556
S109 240091
S11 73437
S111 234357
S112 35362
S114 241630
S115 64631
S116 127471
S117 194595
S118 11727
S119 215206
S120 1075421
S121 233627
S122 287572
S123 165222
S124 91555
S126 260483
S127 400493
S128 171973
S129 65591
S130 290641
S131 166106
S132 70909
S134 81964
S135 241499
S137 41925
S138 70297
S139 241951
S14 164789
S140 20706
S141 96494
S142 28144
S143 229699
S145 58180
S15 126667
S16 31290
S17 159843
S18 40422
S2 134381
S21 9807
S24 68727
S25 104068
S26 93601
S28 39612
S29 61703
S3 19223
S31 68789
S33 22368
S34 128892
S35 94267
S37 109080
S38 91271
S39 73717
S4 130053
S40 57703
S42 111092
S43 20532
S44 56180
S46 61731
S47 123685
S48 18249
S50 79458
S51 68707
S52 11937
S53 117322
S54 125443
S55 126926
S56 86093
S57 118198
S58 76489
S59 137027
S60 88624
S61 122507
S62 149594
S63 84432
S64 110986
S65 172466
S66 49984
S67 201620
S68 51369
S69 102936
S71 107829
S72 98095
S74 101384
S75 151991
S77 77881
S78 158062
S79 84411
S8 59270
S80 26477
S81 20615
S82 224827
S83 158696
S84 228773
S85 206129
S86 39624
S88 22891
S89 118506
S9 62794
S90 30077
S92 47402
S93 182728
S94 39212
S95 11159
S96 176939
S97 262579
S98 303543


phy <- subset_samples(phy, !is.na(Inflammation))
phy <- subset_samples(phy, sample_sums(phy) > 5000)
phy
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4048 taxa and 114 samples ]
sample_data() Sample Data:       [ 114 samples by 5 sample variables ]
tax_table()   Taxonomy Table:    [ 4048 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4048 tips and 4041 internal nodes ]
total = median(sample_sums(phy))
standf = function(x, t = total) round(t * (x/sum(x)))
M.std = transform_sample_counts(phy, standf)

M.std = filter_taxa(M.std, function(x) sum(x > 10) > (0.02 * length(x)) | sum(x) > 
    0.001 * total, TRUE)
# M.f = filter_taxa(M.std,function(x) sum(x) > 0.001*total, TRUE)
ntaxa(M.std)
[1] 1047
require(cluster)
library(factoextra)
df1 <- data.frame(otu_table(M.std))
df_viz <- fviz_nbclust(df1, kmeans, method = "wss") + theme_minimal()
res_fanny <- fanny(t(df1), k = 3, metric = "SqEuclidean")

res_fanny <- pam(t(df1), k = 3)

sample_data(M.std)$cluster <- as.character(res_fanny$clustering)

# clusters<-NULL memb_prob<-res_fanny$membership
# sample_ids<-rownames(memb_prob) for (i in 1:dim(memb_prob)[1]) {
# memb_prob_<-max(memb_prob[i,]) tmp<-data.frame(sample_ids[i], memb_prob_)
# clusters<-rbind(clusters, tmp) }
require(microbiomeSeq)
require(pheatmap)
phyTop <- taxa_level(M.std, "G_species")
TopASVs <- names(sort(taxa_sums(phyTop), TRUE))[1:50]
df1 <- as.data.frame(otu_table(t(phyTop)))
df1 <- df1[which(rownames(df1) %in% TopASVs), ]
df1 <- df1[which(rownames(df1) != " "), ]
df <- data.frame((sample_data(phyTop))[, c("Inflammation", "BV")])
p <- pheatmap(log2(df1 + 1), cluster_rows = FALSE, show_rownames = TRUE, show_colnames = F, 
    cluster_cols = TRUE, annotation_col = df, annotation_row = NA)
p

p <- plot_richness(M.std, x = "cluster", color = "cluster", measures = c("Observed", 
    "Simpson"), title = paste0("Standardized to total reads, N=", nsamples(M.std)))
p <- p + geom_boxplot() + geom_point() + theme_bw() + theme(axis.text = element_text(size = 10, 
    face = "bold"), axis.title = element_text(size = 16, face = "bold")) + geom_point(size = 2)
p

p <- plot_richness(M.std, x = "Inflammation", color = "Inflammation", measures = c("Observed", 
    "Simpson"), title = paste0("Standardized to total reads, N=", nsamples(M.std)))
p <- p + geom_boxplot() + geom_point() + theme_bw() + theme(axis.text = element_text(size = 10, 
    face = "bold"), axis.title = element_text(size = 16, face = "bold")) + geom_point(size = 2)
p

est <- estimate_richness(M.std, split = TRUE, measures = c("Simpson"))
BV_div <- cbind(est, sample_data(M.std)[, "BV"])
t <- wilcox.test(BV_div$Simpson ~ BV_div$BV)
t

    Wilcoxon rank sum test with continuity correction

data:  BV_div$Simpson by BV_div$BV
W = 467, p-value = 5.493e-11
alternative hypothesis: true location shift is not equal to 0
BInflammation_div <- cbind(est, sample_data(M.std)[, "Inflammation"])
t <- wilcox.test(BInflammation_div$Simpson ~ BInflammation_div$Inflammation)
t

    Wilcoxon rank sum test with continuity correction

data:  BInflammation_div$Simpson by BInflammation_div$Inflammation
W = 2420, p-value = 2.206e-06
alternative hypothesis: true location shift is not equal to 0
Cluster_div <- cbind(est, sample_data(M.std)[, "cluster"])
dunn.test::dunn.test(Cluster_div$Simpson, Cluster_div$cluster)
  Kruskal-Wallis rank sum test

data: x and group
Kruskal-Wallis chi-squared = 60.6833, df = 2, p-value = 0

                           Comparison of x by group                            
                                (No adjustment)                                
Col Mean-|
Row Mean |          1          2
---------+----------------------
       2 |  -6.491687
         |    0.0000*
         |
       3 |  -1.269108   6.204012
         |     0.1022    0.0000*

alpha = 0.05
Reject Ho if p <= alpha/2
# ordination based on NMDS and bray-curtis distance
NMDS_bray <- ordinate(M.std, "NMDS")
Square root transformation
Wisconsin double standardization
Run 0 stress 0.1998966 
Run 1 stress 0.2130476 
Run 2 stress 0.2171235 
Run 3 stress 0.2102853 
Run 4 stress 0.2095962 
Run 5 stress 0.2181037 
Run 6 stress 0.219483 
Run 7 stress 0.2124743 
Run 8 stress 0.2054231 
Run 9 stress 0.225887 
Run 10 stress 0.2165795 
Run 11 stress 0.2104146 
Run 12 stress 0.2189583 
Run 13 stress 0.2021484 
Run 14 stress 0.222584 
Run 15 stress 0.2246392 
Run 16 stress 0.2129191 
Run 17 stress 0.2092465 
Run 18 stress 0.209589 
Run 19 stress 0.2216952 
Run 20 stress 0.2298562 
*** No convergence -- monoMDS stopping criteria:
    20: stress ratio > sratmax
title = c("NMDS of 16S microbiome, Bray-Curtis distance")

NMDS.bray.plot <- plot_ordination(M.std, NMDS_bray, color = "BV", shape = "Inflammation", 
    title = title)
NMDS.bray.plot <- NMDS.bray.plot + theme(axis.text = element_text(size = 16, 
    face = "bold"), axis.title = element_text(size = 18, face = "bold"), legend.title = element_text(size = 14)) + 
    theme_bw() + labs(color = "BV") + geom_point(size = 5)
NMDS.bray.plot

PCoA.ord.bray <- ordinate(M.std, "PCoA", "bray")
title = c("PCoA of 16S microbiome, Bray-Curtis distance")
PCoA.ord <- plot_ordination(M.std, PCoA.ord.bray, color = "cluster", shape = "BV", 
    title = title)
PCoA.ord <- PCoA.ord + theme(axis.text = element_text(size = 16, face = "bold"), 
    axis.title = element_text(size = 18, face = "bold"), legend.title = element_text(size = 14)) + 
    theme_bw() + labs(color = "cluster") + geom_point(size = 5)
PCoA.ord

# permanova
require(vegan)
phy_bray <- phyloseq::distance(M.std, method = "bray")
sampledf <- data.frame(sample_data(M.std))

adonis(phy_bray ~ BV, data = sampledf)

Call:
adonis(formula = phy_bray ~ BV, data = sampledf) 

Permutation: free
Number of permutations: 999

Terms added sequentially (first to last)

           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
BV          1     6.973  6.9729  24.445 0.17916  0.001 ***
Residuals 112    31.948  0.2852         0.82084           
Total     113    38.921                 1.00000           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$BV)
permutest(beta)

Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999

Response: Distances
           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
Groups      1 0.00321 0.0032139 0.1963    999  0.648
Residuals 112 1.83390 0.0163741                     
adonis(phy_bray ~ Inflammation, data = sampledf)

Call:
adonis(formula = phy_bray ~ Inflammation, data = sampledf) 

Permutation: free
Number of permutations: 999

Terms added sequentially (first to last)

              Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
Inflammation   1     3.595  3.5948  11.397 0.09236  0.001 ***
Residuals    112    35.326  0.3154         0.90764           
Total        113    38.921                 1.00000           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$Inflammation)
permutest(beta)

Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999

Response: Distances
           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
Groups      1 0.00128 0.0012817 0.0985    999  0.741
Residuals 112 1.45685 0.0130076                     
adonis(phy_bray ~ cluster, data = sampledf)

Call:
adonis(formula = phy_bray ~ cluster, data = sampledf) 

Permutation: free
Number of permutations: 999

Terms added sequentially (first to last)

           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
cluster     2    17.106  8.5530   43.52 0.43951  0.001 ***
Residuals 111    21.815  0.1965         0.56049           
Total     113    38.921                 1.00000           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$BV)
permutest(beta)

Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999

Response: Distances
           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
Groups      1 0.00321 0.0032139 0.1963    999  0.651
Residuals 112 1.83390 0.0163741                     
sampledf$cluster <- as.factor(sampledf$cluster)
logit_model <- glm(Inflammation ~ cluster + BMI + Age, family = binomial(link = "logit"), 
    data = sampledf)
summary(logit_model)

Call:
glm(formula = Inflammation ~ cluster + BMI + Age, family = binomial(link = "logit"), 
    data = sampledf)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8426  -0.7226  -0.6225   0.9218   1.8866  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.54362    2.95825   0.184    0.854    
cluster2    -2.52439    0.64857  -3.892 9.93e-05 ***
cluster3    -0.61320    0.63878  -0.960    0.337    
BMI          0.04109    0.05115   0.803    0.422    
Age         -0.02309    0.14450  -0.160    0.873    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 141.14  on 102  degrees of freedom
Residual deviance: 116.86  on  98  degrees of freedom
  (11 observations deleted due to missingness)
AIC: 126.86

Number of Fisher Scoring iterations: 4
exp(logit_model$coefficients)
(Intercept)    cluster2    cluster3         BMI         Age 
 1.72223820  0.08010712  0.54161358  1.04194406  0.97717878 
exp(confint(logit_model))
                  2.5 %      97.5 %
(Intercept) 0.004740475 573.4388037
cluster2    0.020390051   0.2672824
cluster3    0.144419698   1.8330961
BMI         0.942085319   1.1536758
Age         0.734927320   1.3023893
anova(logit_model, test = "Chisq")
Analysis of Deviance Table

Model: binomial, link: logit

Response: Inflammation

Terms added sequentially (first to last)

        Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                      102     141.14              
cluster  2  23.6062       100     117.54 7.481e-06 ***
BMI      1   0.6494        99     116.89    0.4203    
Age      1   0.0255        98     116.86    0.8731    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Differential abundance using DESEQ2
require(DESeq2)
M.f.des <- taxa_level(M.std, "G_species")
dds <- phyloseq_to_deseq2(M.f.des, ~Inflammation)
# geometric mean, set to zero when all coordinates are zero
geo_mean_protected <- function(x) {
    if (all(x == 0)) {
        return(0)
    }
    exp(mean(log(x[x != 0])))
}
geoMeans <- apply(counts(dds), 1, geo_mean_protected)
dds <- estimateSizeFactors(dds, geoMeans = geoMeans)
dds <- DESeq(dds, fitType = "local")
res = results(dds, cooksCutoff = FALSE)
df <- as.data.frame(colData(dds)[, "BV"])
colnames(df) <- "BV"

alpha = 0.01
sigtab = as.data.frame(res[which(res$padj < alpha & abs(res$log2FoldChange) > 
    1.5), ])
# sigtab = cbind(as(sigtab, 'data.frame'),
# as(tax_table(M.f.des)[rownames(sigtab), ], 'matrix'))
dim(sigtab)
[1] 18  6
table_sig <- sigtab[c("log2FoldChange", "padj")]
table_sig$Abundant_Group <- levels(df$BV)[as.numeric(sigtab$log2FoldChange > 
    0) + 1]
table_sig %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position", 
    "repeat_header", position = "")) %>% scroll_box(width = "100%", height = "300px")
log2FoldChange padj Abundant_Group
Sneathia uncultured bacterium -27.343938 0.0000000 Negative
Sneathia -3.831081 0.0033540 Negative
Sneathia Sneathia amnii -2.463017 0.0002363 Negative
Streptococcus Streptococcus anginosus subsp. anginosus 3.611588 0.0048922 Positive
Gemella Gemella asaccharolytica -2.533120 0.0030237 Negative
Lactobacillus 2.105281 0.0050189 Positive
Anaerococcus uncultured Anaerococcus sp. 2.097034 0.0030237 Positive
Fastidiosipila Clostridiales bacterium oral clone MCE3_9 1.869674 0.0025170 Positive
Fastidiosipila unidentified -3.753272 0.0011412 Negative
Shuttleworthia uncultured bacterium -6.049291 0.0000000 Negative
Veillonella 2.832508 0.0051103 Positive
Megasphaera -3.167728 0.0000000 Negative
Gardnerella -2.850634 0.0094426 Negative
Sutterella Sutterella sp. Marseille-P3161 -5.242801 0.0065728 Negative
Porphyromonas Bacteroidales bacterium KA00251 -4.475867 0.0001637 Negative
Prevotella -5.120501 0.0000000 Negative
Prevotella 6 Prevotella sp. S7-1-8 -6.571508 0.0000000 Negative
Prevotella Prevotella sp. DNF00663 -4.039743 0.0048922 Negative
library("pheatmap")
select <- order(rowMeans(counts(dds, normalized = TRUE)), decreasing = TRUE)[1:50]
df <- as.data.frame(colData(dds)[, c("Inflammation", "BV", "cluster")])

df1 <- data.frame(assay(dds))[select, ]
p <- pheatmap(log2(df1 + 1), cluster_rows = FALSE, show_rownames = TRUE, show_colnames = F, 
    cluster_cols = TRUE, annotation_col = df, annotation_row = NA)
p

require(randomForest)
data.rf <- data.frame(otu_table(M.f.des))
data.rf$BV <- sample_data(M.std)$BV
BV.rf <- randomForest(BV ~ ., data = data.rf, importance = TRUE, proximity = TRUE)
df_accuracy <- as.data.frame(BV.rf$importance)
df_accuracy$Taxa <- rownames(df_accuracy)
df_accuracy <- df_accuracy[order(df_accuracy$MeanDecreaseAccuracy, decreasing = TRUE), 
    ][1:20, ]
df_accuracy$Taxa <- factor(df_accuracy$Taxa, levels = df_accuracy$Taxa)
mda_plot <- ggplot(data = df_accuracy, aes(x = Taxa, y = MeanDecreaseAccuracy)) + 
    theme_bw()
mda_plot <- mda_plot + geom_bar(stat = "identity", fill = "darkblue", width = 0.5)
mda_plot <- mda_plot + theme(axis.text.x = element_text(angle = 90, hjust = 1, 
    vjust = 0.5))
mda_plot <- mda_plot + xlab("") + ylab("Mean Decrease in Accuracy") + coord_flip() + 
    theme(axis.text = element_text(size = 10, face = "bold"), axis.title = element_text(size = 16, 
        face = "bold"))
mda_plot

data.rf_reg <- data.frame(otu_table(M.f.des))
data.rf$BMI <- sample_data(M.f.des)$BMI
BMI.reg <- randomForest(BMI ~ ., data = data.rf, importance = TRUE, proximity = TRUE, 
    na.action = na.omit)
# BMI.reg$importance View(BMI.reg$importance)
R version 3.4.1 (2017-06-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] C

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] randomForest_4.6-14        DESeq2_1.16.1             
 [3] SummarizedExperiment_1.6.5 DelayedArray_0.2.7        
 [5] matrixStats_0.54.0         Biobase_2.36.2            
 [7] GenomicRanges_1.28.6       GenomeInfoDb_1.12.3       
 [9] IRanges_2.10.5             S4Vectors_0.14.7          
[11] BiocGenerics_0.22.1        vegan_2.5-4               
[13] lattice_0.20-38            permute_0.9-5             
[15] pheatmap_1.0.12            microbiomeSeq_0.1         
[17] factoextra_1.0.5           cluster_2.0.7-1           
[19] plotly_4.9.0               ggplot2_3.2.0             
[21] phyloseq_1.25.2            kableExtra_1.1.0          
[23] rmdformats_0.3.5           knitr_1.23                

loaded via a namespace (and not attached):
 [1] backports_1.1.4      uuid_0.1-2           Hmisc_4.2-0         
 [4] plyr_1.8.4           igraph_1.2.4         lazyeval_0.2.2      
 [7] sp_1.3-1             splines_3.4.1        BiocParallel_1.10.1 
[10] rncl_0.8.3           digest_0.6.19        foreach_1.4.4       
[13] htmltools_0.3.6      gdata_2.18.0         memoise_1.1.0       
[16] checkmate_1.9.3      magrittr_1.5         annotate_1.54.0     
[19] Biostrings_2.44.2    readr_1.3.1          gmodels_2.18.1      
[22] prettyunits_1.0.2    colorspace_1.4-1     blob_1.1.1          
[25] rvest_0.3.4          ggrepel_0.8.1        xfun_0.8            
[28] dplyr_0.8.0.1        crayon_1.3.4         RCurl_1.95-4.12     
[31] jsonlite_1.6         genefilter_1.58.1    phylobase_0.8.6     
[34] survival_2.44-1.1    iterators_1.0.10     ape_5.3             
[37] glue_1.3.1           gtable_0.3.0         zlibbioc_1.22.0     
[40] XVector_0.16.0       webshot_0.5.1        seqinr_3.4-5        
[43] questionr_0.7.0      adegraphics_1.0-15   scales_1.0.0        
[46] DBI_1.0.0            miniUI_0.1.1.1       Rcpp_1.0.1          
[49] viridisLite_0.3.0    xtable_1.8-4         progress_1.2.2      
[52] spData_0.3.0         htmlTable_1.13.1     bit_1.1-14          
[55] foreign_0.8-71       spdep_0.7-9          Formula_1.2-3       
[58] htmlwidgets_1.3      httr_1.4.0           RColorBrewer_1.1-2  
[61] acepack_1.4.1        pkgconfig_2.0.2      XML_3.98-1.20       
[64] nnet_7.3-12          deldir_0.1-16        locfit_1.5-9.1      
[67] labeling_0.3         AnnotationDbi_1.38.2 tidyselect_0.2.5    
[70] rlang_0.3.1          reshape2_1.4.3       later_0.8.0         
[73] munsell_0.5.0        adephylo_1.1-11      tools_3.4.1         
 [ reached getOption("max.print") -- omitted 52 entries ]

Acknowledgement

  • H3ABioNet
  • Uganda Virus Research Institute (UVRI)
  • MUII-plus most especially the BCB Hub
  • UCT-CBiO

References

UVRI-H3ABioNet

2019-07-08